Setting up how many cores to use for analysis and fold-enrichment vs freqency

# cores to use for multiprocessing: is the number of cores is small -> we are not on a cluster -> use all; if the number of cores is large -> we are on cluster -> use only 15 or 7 cores (n requested-1)
if(detectCores() <= 4) cores_to_use = detectCores() - 1
if(detectCores() > 4) cores_to_use = 15

# how many permutations?
N_permut = 10

How many of the domains identified using our approach are in the ELM database (as compared to the population of all domains we tested)?

Calculate empirical pvalues using either frequency of a domain among interacting partners of viral protein or Fisher test p-value

# frequency function: set up standard parameters
permuteFrequency = function(data, select_nodes = NULL, also_permuteYZ = F){
    res = permutationPval(interactions2permute = IDs_interactor_viral ~ IDs_interactor_human, # first set of interacting pairs (XY) that are to be permuted
                          associations2test = IDs_interactor_viral ~ IDs_domain_human, # set of interacting pairs to be tested (XZ), YZ interactions are assumed
                          node_attr = list(IDs_interactor_viral ~ IDs_interactor_viral_degree,
                                           IDs_domain_human ~ domain_count,
                                           IDs_interactor_viral + IDs_domain_human ~ domain_frequency_per_IDs_interactor_viral),
                          data = data,
                          statistic = IDs_interactor_viral + IDs_domain_human ~ .N / IDs_interactor_viral_degree,
                          select_nodes = select_nodes,
                          N = N_permut,
                          cores = cores_to_use, seed = 2, also_permuteYZ = also_permuteYZ)
    return(res)
}
data = fread("./processed_data_files/viral_human_net_w_domains", sep = "\t", stringsAsFactors = F)

# frequency: all proteins and domains - # permute IDs_interactor_viral ~ IDs_interactor_human
time = proc.time()
res = permuteFrequency(data, select_nodes = NULL)
proc.time() - time
##    user  system elapsed 
##   0.993   0.064  11.759
plot(res, main = "frequency: all proteins and domains")

# permute BOTH IDs_interactor_viral ~ IDs_interactor_human AND IDs_interactor_human ~ IDs_domain_human
time = proc.time()
res_both = permuteFrequency(data, select_nodes = NULL, also_permuteYZ = T)
proc.time() - time
##    user  system elapsed 
##   1.295   0.074  13.580
plot(res_both, main = "frequency: all proteins and domains")

# frequency: no low background count domains
res_low_back = permuteFrequency(data, select_nodes = IDs_domain_human ~ domain_count >= 3)
#res_low_back_alt = permuteFrequency(data[domain_count >= 3,])
#all.equal(res_low_back$data_with_pval[complete.cases(res_low_back$data_with_pval),p.value], res_low_back_alt$data_with_pval[complete.cases(res_low_back_alt$data_with_pval),p.value])
plot(res_low_back, main = "frequency: no low background count domains (>= 3)")

# frequency: not considering (fixing interactions, degree of every node in the network stays the same, but only high degree proteins are taken into account, equivalent to  permuting only interactions of protein with the degree higher than 1) viral proteins with the degree of 1 - removing viral proteins with the degree of 1
res_low_deg = permuteFrequency(data, select_nodes = IDs_interactor_viral ~ IDs_interactor_viral_degree >= 2)
#res_low_deg_alt = permuteFrequency(data[IDs_interactor_viral_degree >= 2,])
#all.equal(res_low_deg$data_with_pval[complete.cases(res_low_deg$data_with_pval),p.value], res_low_deg_alt$data_with_pval[complete.cases(res_low_deg_alt$data_with_pval),p.value])

plot(res_low_deg, main = "frequency: not considering viral proteins with the degree of 1")

# frequency: BOTH no low background count domains AND removing viral proteins with the degree of 1

res_low_deg_back = permuteFrequency(data, select_nodes = list(IDs_domain_human ~ domain_count >= 3,
                                                              IDs_interactor_viral ~ IDs_interactor_viral_degree >= 2))
#res_low_deg_back_alt = permuteFrequency(data[domain_count >= 3 & IDs_interactor_viral_degree >= 2,])
#all.equal(res_low_deg_back$data_with_pval[complete.cases(res_low_deg_back$data_with_pval),p.value], res_low_deg_back_alt$data_with_pval[complete.cases(res_low_deg_back_alt$data_with_pval),p.value])
plot(res_low_deg_back, main = "frequency: no low background count domains (>= 3)\nAND not considering viral proteins with the degree of 1")

# count function: set up standard parameters
permuteCount = function(data, select_nodes = NULL, also_permuteYZ = F){
    res = permutationPval(interactions2permute = IDs_interactor_viral ~ IDs_interactor_human, # first set of interacting pairs (XY) that are to be permuted
                          associations2test = IDs_interactor_viral ~ IDs_domain_human, # set of interacting pairs to be tested (XZ), YZ interactions are assumed
                          node_attr = list(IDs_interactor_viral ~ IDs_interactor_viral_degree,
                                           IDs_domain_human ~ domain_count,
                                           IDs_interactor_viral + IDs_domain_human ~ domain_frequency_per_IDs_interactor_viral),
                          data = data,
                          statistic = IDs_interactor_viral + IDs_domain_human ~ .N,
                          select_nodes = select_nodes,
                          N = N_permut,
                          cores = cores_to_use, seed = 2, also_permuteYZ = also_permuteYZ)
    return(res)
}
# count: all proteins and domains - # permute IDs_interactor_viral ~ IDs_interactor_human
time = proc.time()
res_count = permuteCount(data, select_nodes = NULL)
proc.time() - time
##    user  system elapsed 
##   1.475   0.034  11.238
plot(res_count, main = "count: all proteins and domains")

# Fisher test: set up standard parameters
permuteFisherTest = function(data, select_nodes = NULL, also_permuteYZ = F){
    resFISHER = permutationPval(interactions2permute = IDs_interactor_viral ~ IDs_interactor_human, # first set of interacting pairs (XY) that are to be permuted
                                associations2test = IDs_interactor_viral ~ IDs_domain_human, # set of interacting pairs to be tested (XZ), YZ interactions are assumed
                                node_attr = list(IDs_interactor_viral ~ IDs_interactor_viral_degree, # attribute of X
                                                 IDs_domain_human ~ domain_count + N_prot_w_interactors, # attributes of Z
                                                 IDs_interactor_viral + IDs_domain_human ~ domain_count_per_IDs_interactor_viral), # attribute of both X and Z
                                data = data, # data.table containing data
                                statistic = IDs_interactor_viral + IDs_domain_human ~ fisher.test(
                                    matrix(c(domain_count_per_IDs_interactor_viral[1], 
                                             IDs_interactor_viral_degree[1] - domain_count_per_IDs_interactor_viral[1],
                                             domain_count[1], 
                                             N_prot_w_interactors[1] - domain_count[1]),
                                           2,2), 
                                    alternative = "greater", conf.int = F)$p.value, # formula to calculate statisic by evaluating right-hand-side expression for each X and Z pair, right-hand-side expression is what is normally put in j in data.table DT[i, j, by], left-hand-side expression contains column names of X and Z which are used in by in data.table
                                select_nodes = select_nodes, # select a subset of the data, only nodes 
                                N = N_permut, # number of permutations
                                cores = cores_to_use, seed = 1, also_permuteYZ = also_permuteYZ)
    # permutationPval returns the number of cases when permuted statitic is higher than the observed statistic (right tail of the distribution), in this case we are interested in the reverse - the lower tail, when p-values from permuted distribution that are lower than the observed p-value
    resFISHER$data_with_pval[, p.value := 1 - p.value]
    return(resFISHER)
}

# contingency matrix:
matrix(c("domain_count_per_IDs_interactor_viral[1]", 
         "IDs_interactor_viral_degree[1] - domain_count_per_IDs_interactor_viral[1]",
         "domain_count[1]", 
         "N_prot_w_interactors[1] - domain_count[1]"),2,2)
##      [,1]                                                                       
## [1,] "domain_count_per_IDs_interactor_viral[1]"                                 
## [2,] "IDs_interactor_viral_degree[1] - domain_count_per_IDs_interactor_viral[1]"
##      [,2]                                       
## [1,] "domain_count[1]"                          
## [2,] "N_prot_w_interactors[1] - domain_count[1]"
# permute IDs_interactor_viral ~ IDs_interactor_human
time = proc.time()
resFISHER = permuteFisherTest(data, select_nodes = NULL)
proc.time() - time
##    user  system elapsed 
##   3.154   0.061  14.897
plot(resFISHER, main = "Fisher test P-value: all proteins and domains")

# permute BOTH IDs_interactor_viral ~ IDs_interactor_human AND IDs_interactor_human ~ IDs_domain_human
time = proc.time()
resFISHER_both = permuteFisherTest(data, select_nodes = NULL, also_permuteYZ = T)
proc.time() - time
##    user  system elapsed 
##   3.198   0.039  14.408
plot(resFISHER_both, main = "Fisher test P-value: all proteins and domains \npermute BOTH IDs_interactor_viral ~ IDs_interactor_human AND IDs_interactor_human ~ IDs_domain_human")

Viral protein degree and the background domain count of top-scoring proteins

# function to accomodate ggplot2::geom_bin2d in GGally::ggpairs, taken from http://ggobi.github.io/ggally/#custom_functions
d2_bin_log10 <- function(data, mapping, ..., low = "#132B43", high = "#56B1F7") {
    ggplot(data = data, mapping = mapping) +
        geom_bin2d(...) +
        scale_fill_gradient(low = low, high = high) +
        scale_y_log10() + scale_x_log10() + annotation_logticks()
}

log10_density = function(data, mapping, ...){
    ggplot(data = data, mapping = mapping) +
        geom_density(...) +
        scale_x_log10()
}

PermutResult2D = function(res, N){
    res_temp = unique(res$data_with_pval[,.(IDs_interactor_viral, IDs_domain_human,
                                            domain_count, 
                                            IDs_interactor_viral_degree, 
                                            domain_count_per_IDs_interactor_viral,
                                            p.value)])
    GGally::ggpairs(res_temp[order(p.value, decreasing = F)[1:N],],
                    columns = c("domain_count", 
                                "IDs_interactor_viral_degree", 
                                "domain_count_per_IDs_interactor_viral",
                                "p.value"),
                    lower = list(continuous = d2_bin_log10)#, 
                    #diag = list(continuous = geom_density)
    ) +
        theme_light() +
        theme(strip.text.y = element_text(angle = 0, size = 10),
              strip.text.x = element_text(angle = 90, size = 10))
}

# frequency
PermutResult2D(res = res, N = 250) +
    ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

PermutResult2D(res = res_both, N = 250) +
    ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein \n permute BOTH IDs_interactor_viral ~ IDs_interactor_human AND IDs_interactor_human ~ IDs_domain_human")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

PermutResult2D(res = res_low_back, N = 250) +
    ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein \n no low background count domains")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

PermutResult2D(res = res_low_deg, N = 250) +
    ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein \n no viral proteins with the degree of 1")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

PermutResult2D(res = res_low_deg_back, N = 250) +
    ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein \n no low background count domains \nAND no viral proteins with the degree of 1")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

# count
PermutResult2D(res = res_count, N = 250) +
    ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: count of a domain among interacting partners of a viral protein")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

# Fisher test p-value
PermutResult2D(res = resFISHER, N = 250) + 
    ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: Fisher test p-value")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

PermutResult2D(res = resFISHER_both, N = 250) + 
    ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: Fisher test p-value \n permute BOTH IDs_interactor_viral ~ IDs_interactor_human AND IDs_interactor_human ~ IDs_domain_human")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

Map domains known to interact with linear motifs from ELM to the domains we found

interactiondomains = fread("http://elm.eu.org/interactiondomains.tsv")
interactiondomains[, pfam_id := `Interaction Domain Id`]

domains_known = interactiondomains[, unique(pfam_id)]

InterProScan_domains = readInterProGFF3("../viral_project/processed_data_files/all_human_viral_protein_domains.gff3.gz")
# get InterProID to member database ID mapping
InterPro2memberDB = getInterPro2memberDB(InterProScan_domains)
InterPro2memberDB = InterPro2memberDB[complete.cases(InterPro2memberDB)]
domains_known_mapped = unique(InterPro2memberDB[memberDBID %in% domains_known | InterProID %in% domains_known, InterProID])
domains_not_mapped = unique(domains_known[!(domains_known %in% InterPro2memberDB$memberDBID | domains_known %in% InterPro2memberDB$InterProID)])

I did Fisher test to evaluate if the domains that we find are enriched in domains known to interact with linear motifs (from ELM). I have picked some number of viral protein - human domain associations from the top (by p-value). Then I counted how many known domains we have found and did Fisher test. I decided to compare two statistic choices (frequency of a domain among interacting partners of a viral protein or Fisher test p-value) on how many of the known domains we tend to find. Finally, I was choosing different cutoffs (different number of top p-value pairs).

testEnrichment = function(N, res, domains_known_mapped, random = F, name = ""){
    if(random) {
        res$data_pval = unique(res$data_with_pval[,.(IDs_interactor_viral, IDs_domain_human, p.value, domain_type, domain_count, IDs_interactor_viral_degree)])
        domains_found = res$data_pval[sample(1:nrow(res$data_with_pval), N), unique(IDs_domain_human)]
    } else {
        res$data_pval = unique(res$data_with_pval[,.(IDs_interactor_viral, IDs_domain_human, p.value, domain_type, domain_count, IDs_interactor_viral_degree)])
        domains_found = res$data_pval[order(p.value, decreasing = F)[1:N], unique(IDs_domain_human)]
    }
    
    alldomains = res$data_with_pval[, unique(IDs_domain_human)]
    known = factor(alldomains %in% domains_known_mapped, levels = c("TRUE", "FALSE"))
    found = factor(alldomains %in% domains_found, levels = c("TRUE", "FALSE"))
    table_res = table(known, found)
    
    test = fisher.test(table(known, found), alternative = "greater", conf.int = T)
    
    return(c(pval = test$p.value, odds_ratio = as.vector(test$estimate), count = table_res["TRUE", "TRUE"], name = name))
}
runningTestEnrichment = function(res, name){
    enrichment = sapply(Ns, testEnrichment, res, domains_known_mapped, name = "domain frequency among interactors of a viral protein")
colnames(enrichment) = Ns
return(enrichment)
}

Ns = seq(25, 500, 25)
# frequency
enrichment = runningTestEnrichment(res, name = "domain frequency among interactors of a viral protein")
enrichment_both = runningTestEnrichment(res_both, name = "domain frequency among interactors of a viral protein, permute both")
enrichment_low_back = runningTestEnrichment(res_low_back, name = "domain frequency: no low background")
enrichment_low_deg = runningTestEnrichment(res_low_deg, name = "domain frequency: no degree of 1")
enrichment_low_deg_back = runningTestEnrichment(res_low_deg_back, name = "domain frequency: no degree of 1 AND no low background")

# count
enrichment_count = runningTestEnrichment(res_count, name = "domain count among interactors of a viral protein")

# Fisher test pval
enrichmentFISHER = runningTestEnrichment(resFISHER, name = "Fisher test pval: domain overrepresentation over the background")
enrichmentFISHER_both = runningTestEnrichment(resFISHER_both, name = "Fisher test pval: domain overrepresentation over the background, permute both")

random_domains = function(N = 100, seed = seed, Ns = seq(25, 500, 25)){
    set.seed(seed)
    
    quantiles = c(0.975, 0.75, 0.5, 0.25, 0.025)
    quantile_names = c("97.5% quantile", "75% quantile", "median", "25% quantile", "2.5% quantile")
    
    pval_temp = replicate(N, {
        enrichmentRANDOM = sapply(Ns, testEnrichment, res, domains_known_mapped, random = T, name = "N random proteins")[1,]
        names(enrichmentRANDOM) = Ns
        as.numeric(enrichmentRANDOM)
    })
    pval = apply(pval_temp, 1, quantile, probs = quantiles)
    rownames(pval) = quantile_names
    colnames(pval) = Ns
    
    odds_ratio_temp = replicate(N, {
        enrichmentRANDOM = sapply(Ns, testEnrichment, res, domains_known_mapped, random = T, name = "N random proteins")[2,]
        names(enrichmentRANDOM) = Ns
        as.numeric(enrichmentRANDOM)
    })
    odds_ratio = apply(odds_ratio_temp, 1, quantile, probs = quantiles)
    rownames(odds_ratio) = quantile_names
    colnames(odds_ratio) = Ns
    
    count_temp = replicate(N, {
        enrichmentRANDOM = sapply(Ns, testEnrichment, res, domains_known_mapped, random = T, name = "N random proteins")[3,]
        names(enrichmentRANDOM) = Ns
        as.numeric(enrichmentRANDOM)
    })
    count = apply(count_temp, 1, quantile, probs = quantiles)
    rownames(count) = quantile_names
    colnames(count) = Ns
    
    return(list(pval = pval, odds_ratio = odds_ratio, count = count))
}

enrichmentRANDOM = random_domains(1000, 1)

As we include more proteins, the number of known domains we find increases and then levels off (probably because some of the known domains do not interact with viral proteins).

plotEnrichment = function(..., random_domains = NULL, domains_known_mapped, type = "count", plot_type = plot_type){
    
    res = list(...)
    typenum = match(type, c("pval", "odds_ratio", "count"))
    ngroups = length(res)
    
    if(type == "count") color = brewer.pal(ngroups + 1, "Dark2") else color = brewer.pal(ngroups, "Dark2")
    if(is.na(typenum)) stop("'type' should be one of “count”, “odds_ratio”, “pval”")
    
    leg_pos_y = max(sapply(res, function(x, typenum) max(as.numeric(x[typenum,])), typenum))
    if(!is.null(random_domains)) leg_pos_y = max(leg_pos_y, random_domains[typenum][[1]])
    if(type == "count") leg_pos_y = length(domains_known_mapped) - 1
    leg_pos_x = max(sapply(res, function(x) max(as.numeric(colnames(x))))) * 0.16
    
    if(type == "pval") {ylim = c(0, 1); ylab = "p-value"}
    if(type == "count") {ylim = c(0,length(domains_known_mapped)+1); ylab = "known domain found"}
    if(type == "odds_ratio") {ylim = c(0,leg_pos_y); ylab = "Fisher test odds ratio"}
    
    plot(colnames(res[[1]]),rep(0,ncol(res[[1]])), 
         ylab = ylab, xlab = "top N viral protein - domain pairs selected",
         type = plot_type, ylim = ylim, lwd = 0)
    # plot random domains quantiles
    if(!is.null(random_domains)){
        random_legend = c("97.5% quantile", "75% quantile", "median", "25% quantile", "2.5% quantile")
        random_cols = c("#DDDDDD", "#CCCCCC", "#AAAAAA", "#CCCCCC", "#DDDDDD")
        random_line_width = c(2,4,8,4,2)
        for (i in 1:5) {
            lines(x = colnames(random_domains[typenum][[1]]), y = random_domains[typenum][[1]][random_legend[i],], col = random_cols[i], lwd = random_line_width[i], type = plot_type)
        }
    }
    
    for (i in 1:ngroups) {
        lines(x = colnames(res[[i]]), y = res[[i]][typenum,], col = color[i], type = plot_type, lwd = 3)
    }
    
    if(type == "count") abline(h = length(domains_known_mapped), col = color[ngroups + 1])
    
    legend_names = c("statictic used in permutation test:")
    for(i in 1:ngroups){
        legend_names = c(legend_names, unique(res[[i]]["name",]))
    }
    
    if(type == "count") legend_names = c(legend_names, "domains known to interact with linear motifs")
    
    line_width = rep(3, length(color) + 1)
    
    if(!is.null(random_domains)){
        legend_names = c(legend_names, paste0("N random protein-domain pairs, ", random_legend))
        color = c(color, random_cols)
        line_width = c(line_width, random_line_width)
    }
    
    
    legend(x = leg_pos_x, y = leg_pos_y, legend_names, 
           col = c("white", color), lty = 1, lwd = line_width, merge = TRUE)
}

plotEnrichment(enrichment, enrichment_both, enrichment_low_back, enrichment_low_deg, enrichment_low_deg_back,
               enrichment_count, enrichmentFISHER, enrichmentFISHER_both,
               random_domains = enrichmentRANDOM, 
               domains_known_mapped = domains_known_mapped, type = "count", plot_type = "l")
## Warning in brewer.pal(ngroups + 1, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

As we include more proteins, the Fisher test odds ratio decreases (we add more stuff that is not known). Odds ratio measures how much more likely are we to find a domain using our procedure if it’s a known domain as compared to if it’s not a known domain.

plotEnrichment(enrichment, enrichment_both, enrichment_low_back, enrichment_low_deg, enrichment_low_deg_back,
               enrichment_count, enrichmentFISHER, enrichmentFISHER_both,
               random_domains = enrichmentRANDOM, 
               domains_known_mapped = domains_known_mapped, type = "odds_ratio", plot_type = "l")

corresponding P-values from the Fisher test

plotEnrichment(enrichment, enrichment_both, enrichment_low_back, enrichment_low_deg, enrichment_low_deg_back,
               enrichment_count, enrichmentFISHER, enrichmentFISHER_both,
               random_domains = enrichmentRANDOM, 
               domains_known_mapped = domains_known_mapped, type = "pval", plot_type = "l")

How many viral proteins are known per each of the domain instances in top 200 protein-domain pairs?

selectTopHits = function(res, N){
    res$data_pval = unique(res$data_with_pval[,.(IDs_interactor_viral, IDs_domain_human, p.value, domain_type, domain_count, IDs_interactor_viral_degree)])
    pairs200pval = res$data_pval[order(p.value, decreasing = F)[1:N], max(p.value)]
    
    restop100 = res
    restop100$data_with_pval = restop100$data_with_pval[p.value <= pairs200pval, ]
    restop100$data_with_pval[, N_viral_per_human_w_domain := length(unique(IDs_interactor_viral)), by = .(IDs_interactor_human, IDs_domain_human)]
    return(restop100)
}
restop100 = selectTopHits(res, N = 250)
plot(restop100, IDs_interactor_human ~ N_viral_per_human_w_domain)

plot(restop100)

49 human proteins with enriched domains have 5 or more viral interacting partners.
11 human proteins with enriched domains have 10 or more viral interacting partners.

what are those domains? are they known ELM-interacting domains? which proteins they are in? which viral taxons they interact with?

restop100$data_with_pval[N_viral_per_human_w_domain >= 10, unique(IDs_domain_human)]
## [1] "IPR000504" "IPR001452" "IPR001478" "IPR018316"
restop100$data_with_pval[N_viral_per_human_w_domain >= 10, unique(IDs_domain_human)] %in% domains_known_mapped
## [1]  TRUE  TRUE  TRUE FALSE
restop100$data_with_pval[N_viral_per_human_w_domain >= 10 & IDs_domain_human == "IPR000504", unique(IDs_interactor_human)]
## [1] "O43390" "P09651" "P11940" "P22626" "P31943" "P38159" "Q99729"
restop100$data_with_pval[N_viral_per_human_w_domain >= 10 & IDs_domain_human == "IPR000504", unique(Taxid_interactor_viral)]
##  [1]  10254 380964  88776 130763  10299 270485 211044  10377  10376  10298
## [11] 680716  28344 333284 128952  11097  10884 121791  10249 796210 381518
## [21] 387139 643960
DT::datatable(restop100$data_with_pval[N_viral_per_human_w_domain >= 10,])

R session information

save(list = ls(), file="./processed_data_files/what_we_find_VS_ELM_clust.RData")
Sys.Date()
## [1] "2017-09-02"
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-2   GGally_1.3.2         ggplot2_2.2.1       
##  [4] rtracklayer_1.36.4   GenomicRanges_1.28.4 GenomeInfoDb_1.12.2 
##  [7] MItools_0.1.14       Biostrings_2.44.2    XVector_0.16.0      
## [10] data.table_1.10.4    PSICQUIC_1.14.0      plyr_1.8.4          
## [13] httr_1.3.0           biomaRt_2.32.1       IRanges_2.10.2      
## [16] S4Vectors_0.14.3     BiocGenerics_0.22.0 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.12               lattice_0.20-35           
##  [3] Rsamtools_1.28.0           rprojroot_1.2             
##  [5] digest_0.6.12              R6_2.2.2                  
##  [7] backports_1.1.0            RSQLite_2.0               
##  [9] evaluate_0.10.1            zlibbioc_1.22.0           
## [11] rlang_0.1.2                lazyeval_0.2.0            
## [13] blob_1.1.0                 R.utils_2.5.0             
## [15] R.oo_1.21.0                Matrix_1.2-11             
## [17] DT_0.2                     qvalue_2.8.0              
## [19] rmarkdown_1.6              gsubfn_0.6-6              
## [21] proto_1.0.0                labeling_0.3              
## [23] splines_3.4.1              BiocParallel_1.10.1       
## [25] stringr_1.2.0              htmlwidgets_0.9           
## [27] RCurl_1.95-4.8             bit_1.1-12                
## [29] munsell_0.4.3              DelayedArray_0.2.7        
## [31] compiler_3.4.1             htmltools_0.3.6           
## [33] SummarizedExperiment_1.6.3 tibble_1.3.3              
## [35] GenomeInfoDbData_0.99.0    matrixStats_0.52.2        
## [37] XML_3.98-1.9               reshape_0.8.7             
## [39] GenomicAlignments_1.12.1   bitops_1.0-6              
## [41] R.methodsS3_1.7.1          grid_3.4.1                
## [43] jsonlite_1.5               gtable_0.2.0              
## [45] DBI_0.7                    magrittr_1.5              
## [47] scales_0.4.1               stringi_1.1.5             
## [49] reshape2_1.4.2             tools_3.4.1               
## [51] bit64_0.9-7                Biobase_2.36.2            
## [53] yaml_2.1.14                AnnotationDbi_1.38.2      
## [55] colorspace_1.3-2           memoise_1.1.0             
## [57] knitr_1.17